{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction to NLP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import bcolz\n",
    "import re\n",
    "import itertools\n",
    "from numpy.random import random, permutation, randn, normal, uniform, choice\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We're going to look at the IMDB dataset, which contains movie reviews from IMDB, along with their sentiment.\n",
    "\n",
    "See below to download the dataset.\n",
    "\n",
    "We will compare to the 2011 ACL [stanford paper](http://ai.stanford.edu/~amaas/data/sentiment/)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#need to be done only once\n",
    "#!wget -O $data_imdb https://s3.amazonaws.com/text-datasets/imdb_full.pkl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#to be modified\n",
    "data_folder = '/home/lelarge/courses/data/imdb/'\n",
    "data_imdb = data_folder+'imdb_full.pkl'\n",
    "data_idx = data_folder+'idx.pkl'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Embeddings\n",
    "\n",
    "We start with a small recap about [embeddings](https://pytorch.org/docs/master/nn.html#embedding)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "arr = np.array([[1,2,4,5],[4,3,2,0]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 2, 4, 5])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "arr[0,:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2, 4)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "arr.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[[-0.4097,  0.5366, -0.1598],\n",
       "         [-1.0705,  1.9416,  1.5148],\n",
       "         [-3.3320, -0.8998, -0.2446],\n",
       "         [-0.2292,  0.8146, -0.2218]],\n",
       "\n",
       "        [[-3.3320, -0.8998, -0.2446],\n",
       "         [-0.3514,  1.6110, -0.2107],\n",
       "         [-1.0705,  1.9416,  1.5148],\n",
       "         [ 1.7622,  0.5130, -0.0972]]])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "embedding_dim = 3\n",
    "embedding_user = nn.Embedding(6, embedding_dim)\n",
    "input = torch.LongTensor([[1,2,4,5],[4,3,2,0]])\n",
    "ex = embedding_user(input)\n",
    "ex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([2, 4, 3])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ex.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[-0.4097,  0.5366, -0.1598, -1.0705,  1.9416,  1.5148, -3.3320,\n",
       "         -0.8998, -0.2446, -0.2292,  0.8146, -0.2218],\n",
       "        [-3.3320, -0.8998, -0.2446, -0.3514,  1.6110, -0.2107, -1.0705,\n",
       "          1.9416,  1.5148,  1.7622,  0.5130, -0.0972]])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ex.view(2,12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[[-0.4097,  0.5366, -0.1598],\n",
       "         [-1.0705,  1.9416,  1.5148],\n",
       "         [-3.3320, -0.8998, -0.2446],\n",
       "         [-0.2292,  0.8146, -0.2218]],\n",
       "\n",
       "        [[-3.3320, -0.8998, -0.2446],\n",
       "         [-0.3514,  1.6110, -0.2107],\n",
       "         [-1.0705,  1.9416,  1.5148],\n",
       "         [ 1.7622,  0.5130, -0.0972]]])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[[-0.4097, -1.0705, -3.3320, -0.2292],\n",
       "         [ 0.5366,  1.9416, -0.8998,  0.8146],\n",
       "         [-0.1598,  1.5148, -0.2446, -0.2218]],\n",
       "\n",
       "        [[-3.3320, -0.3514, -1.0705,  1.7622],\n",
       "         [-0.8998,  1.6110,  1.9416,  0.5130],\n",
       "         [-0.2446, -0.2107,  1.5148, -0.0972]]])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ex.permute(0,2,1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sentiment analysis\n",
    "\n",
    "You will build a simple neural network able to classify texts (reviews of movies) in two classes: positive or negative."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "f = open(data_imdb,'rb')\n",
    "(x_train, labels_train), (x_test, labels_test) = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "25000"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(x_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "g = open(data_idx, 'rb')\n",
    "idx = pickle.load(g)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1869"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "idx.get('river')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The words are sorted according to their frequency, i.e. the most frequent word has the lowest value.\n",
    "\n",
    "This is the word list:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['the', 'and', 'a', 'of', 'to', 'is', 'br', 'in', 'it', 'i']"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "idx_arr = sorted(idx, key=idx.get)\n",
    "idx_arr[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is the mapping from id to word"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "idx2word = {v: k for k, v in idx.items()}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here's the 1st review. As you see, the words have been replaced by ids."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'23022, 309, 6, 3, 1069, 209, 9, 2175, 30, 1, 169, 55, 14, 46, 82, 5869, 41, 393, 110, 138, 14, 5359, 58, 4477, 150, 8, 1, 5032, 5948, 482, 69, 5, 261, 12, 23022, 73935, 2003, 6, 73, 2436, 5, 632, 71, 6, 5359, 1, 25279, 5, 2004, 10471, 1, 5941, 1534, 34, 67, 64, 205, 140, 65, 1232, 63526, 21145, 1, 49265, 4, 1, 223, 901, 29, 3024, 69, 4, 1, 5863, 10, 694, 2, 65, 1534, 51, 10, 216, 1, 387, 8, 60, 3, 1472, 3724, 802, 5, 3521, 177, 1, 393, 10, 1238, 14030, 30, 309, 3, 353, 344, 2989, 143, 130, 5, 7804, 28, 4, 126, 5359, 1472, 2375, 5, 23022, 309, 10, 532, 12, 108, 1470, 4, 58, 556, 101, 12, 23022, 309, 6, 227, 4187, 48, 3, 2237, 12, 9, 215'"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "', '.join(map(str, x_train[0]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The ids can be looked up in idx2word."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'bromwell'"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "idx2word[23022]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector i'm here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isn't\""
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "' '.join([idx2word[o] for o in x_train[0]])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Reduce vocab size by setting rare words to max index."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "vocab_size = 5000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "trn = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in x_train]\n",
    "test = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in x_test]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Look at distribution of lengths of sentences."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2493, 10, 237.71364)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lens = np.array(list(map(len, trn)))\n",
    "(lens.max(), lens.min(), lens.mean())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Pad (with zero) or truncate each sentence to make consistent length."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "seq_len = 500\n",
    "\n",
    "trn_pad = np.zeros((len(x_train),seq_len)).astype(np.int32)\n",
    "test_pad = np.zeros((len(x_test),seq_len)).astype(np.int32)\n",
    "\n",
    "for idx, s in enumerate(trn):\n",
    "    trunc = s[-seq_len:]\n",
    "    trn_pad[idx, -len(trunc):] = trunc\n",
    "\n",
    "for idx, s in enumerate(test):\n",
    "    trunc = s[-seq_len:]\n",
    "    test_pad[idx, -len(trunc):] = trunc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[  0,   0,   0, ...,  12,   9, 215],\n",
       "       [  0,   0,   0, ...,   5, 336, 406],\n",
       "       [  0,   0,   0, ...,   6, 176, 397],\n",
       "       [  0,   0,   0, ...,  14,   3, 482],\n",
       "       [  0,   0,   0, ...,  65, 528,  70]], dtype=int32)"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trn_pad[:5,:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(25000, 500)"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trn_pad.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(25000, 500)"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_pad.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Using gpu: False \n"
     ]
    }
   ],
   "source": [
    "def gpu(tensor, gpu=False):\n",
    "    if gpu:\n",
    "        return tensor.cuda()\n",
    "    else:\n",
    "        return tensor\n",
    "\n",
    "use_gpu = torch.cuda.is_available()\n",
    "print('Using gpu: %s ' % use_gpu)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1, 1, 1, 1, 1]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "labels_test[:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Single hidden layer NN\n",
    "\n",
    "Complete the code below, use `nn.Embedding, squeeze` and the usual `nn.Linear, F.relu, F.sigmoid, F.dropout`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class FirstModel(nn.Module):\n",
    "    \n",
    "    def __init__(self,\n",
    "                 embedding_dim=30,vocab_size = 1,seq_len = 1):\n",
    "        \n",
    "        super(FirstModel, self).__init__()\n",
    "        \n",
    "        self._seq_len = seq_len\n",
    "        self._embedding_dim = embedding_dim\n",
    "        self._vocab_size = vocab_size\n",
    "\n",
    "        #\n",
    "        # your code here\n",
    "        #\n",
    "        \n",
    "    def forward(self, words_id):\n",
    "        \n",
    "        #\n",
    "        # your code here\n",
    "        #"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "net1 = FirstModel(embedding_dim = 32, vocab_size = vocab_size, seq_len=seq_len)\n",
    "learning_rate = 1e-3\n",
    "optimizer = torch.optim.Adam(net1.parameters(),lr=learning_rate, weight_decay=0)\n",
    "loss_fn = torch.nn.BCELoss()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Code the training loop and the test."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "batch_size = 64\n",
    "def minibatch_sentences(batch_size, word, sent):\n",
    "    for i in range(0, len(sent), batch_size):\n",
    "        yield tuple([word[i:i+batch_size,:], sent[i:i+batch_size]])\n",
    "        \n",
    "def shuffle_sentences(word,sent):\n",
    "    random_state = np.random.RandomState()\n",
    "    shuffle_indices = np.arange(len(sent))\n",
    "    random_state.shuffle(shuffle_indices)\n",
    "    return tuple([word[shuffle_indices,:], sent[shuffle_indices]])\n",
    "\n",
    "def accuracy_one(x):\n",
    "    return x[:,0] < 0.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def test(net, word_ids, sentiment):\n",
    "    net.train(False)\n",
    "    word_ids = word_ids.astype(np.int64)\n",
    "    word_ids_tensor = gpu(torch.from_numpy(word_ids), use_gpu)\n",
    "    sent_tensor = gpu(torch.from_numpy(np.asarray(sentiment).astype(np.float32)), use_gpu)\n",
    "    epoch_loss = 0.0\n",
    "    epoch_acc = 0.0\n",
    "    for (minibatch_num, (batch_word, batch_sent)) in enumerate(minibatch_sentences(batch_size, word_ids_tensor, sent_tensor)):\n",
    "        #\n",
    "        # your code here\n",
    "        #\n",
    "\n",
    "    return epoch_loss, epoch_acc\n",
    "\n",
    "def fit(net,word_ids, sentiment, word_ids_test, sentiment_test, n_iter = 3, verbose=True):\n",
    "    word_ids = word_ids.astype(np.int64)\n",
    "    word_ids_test = word_ids_test.astype(np.int64)\n",
    "    net.train(True)\n",
    "    \n",
    "    for epoch_num in range(n_iter):\n",
    "        words, sents = shuffle_sentences(word_ids,np.asarray(sentiment).astype(np.float32))\n",
    "        word_ids_tensor = gpu(torch.from_numpy(words), use_gpu)\n",
    "        sent_tensor = gpu(torch.from_numpy(sents), use_gpu)\n",
    "        epoch_loss = 0.0\n",
    "        epoch_acc = 0.0\n",
    "        for (minibatch_num, (batch_word, batch_sent)) in enumerate(minibatch_sentences(batch_size, word_ids_tensor, sent_tensor)):\n",
    "            #\n",
    "            # your code here\n",
    "            #\n",
    "\n",
    "        if verbose:\n",
    "            val_loss, val_acc = test(net,word_ids_test, sentiment_test)\n",
    "            print('Epoch {}: train loss {}'.format(epoch_num, epoch_loss), 'train acc', epoch_acc,'validation loss', val_loss,'validation acc', val_acc)\n",
    "            model.train(True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "fit(net1,trn_pad, labels_train, test_pad, labels_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The importance of right initialization.\n",
    "\n",
    "Implement a scaled embedding implementing [Xavier initialization](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class ScaledEmbedding(nn.Embedding):\n",
    "    \"\"\"\n",
    "    Embedding layer that initialises its values\n",
    "    to using a normal variable scaled by the inverse\n",
    "    of the emedding dimension.\n",
    "    \"\"\"\n",
    "\n",
    "    def reset_parameters(self):\n",
    "        \"\"\"\n",
    "        Initialize parameters.\n",
    "        \"\"\"\n",
    "        #\n",
    "        # your code here\n",
    "        #"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "fit(net1,trn_pad, labels_train, test_pad, labels_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Single conv layer with max pooling\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "Add a first 1d-convolutional layer with maxpooling.\n",
    "\n",
    "Here is an example of [architecture](https://machinelearningmastery.com/predict-sentiment-movie-reviews-using-deep-learning/)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GloVe Embedding\n",
    "\n",
    "[Global Vectors for Word Representation](https://nlp.stanford.edu/projects/glove/) by  Jeffrey Pennington,   Richard Socher,   Christopher D. Manning "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "glove_folder = '/home/lelarge/courses/data/glove/'\n",
    "#glove_folder = '/home/ubuntu/data/glove/'\n",
    "glove_file = glove_folder + '6B.50d.tgz'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "#need to be done only once\n",
    "#%mkdir -p $glove_folder\n",
    "#!wget -O $glove_file http://files.fast.ai/models/glove/6B.50d.tgz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import tarfile\n",
    "tar = tarfile.open(glove_file, \"r:gz\")\n",
    "tar.extractall(glove_folder)\n",
    "tar.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def load_vectors(loc):\n",
    "    return (load_array(loc+'.dat'),\n",
    "        pickle.load(open(loc+'_words.pkl','rb'), encoding='latin1'),\n",
    "        pickle.load(open(loc+'_idx.pkl','rb'),encoding='latin1'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def load_array(fname):\n",
    "    return bcolz.open(fname)[:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "glove_loc =glove_folder+'6B.50d'\n",
    "vecs, words, wordidx = load_vectors(glove_loc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "vecs.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.manifold import TSNE\n",
    "\n",
    "tsne = TSNE(n_components=2, random_state=0)\n",
    "Y = tsne.fit_transform(vecs[:500])\n",
    "\n",
    "start=0; end=350\n",
    "dat = Y[start:end]\n",
    "plt.figure(figsize=(15,15))\n",
    "plt.scatter(dat[:, 0], dat[:, 1])\n",
    "for label, x, y in zip(words[start:end], dat[:, 0], dat[:, 1]):\n",
    "    plt.text(x,y,label, color=np.random.rand(3)*0.7,\n",
    "                 fontsize=14)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def create_emb():\n",
    "    n_fact = vecs.shape[1]\n",
    "    emb = np.zeros((vocab_size, n_fact))\n",
    "\n",
    "    for i in range(1,len(emb)):\n",
    "        word = idx2word[i]\n",
    "        if word and re.match(r\"^[a-zA-Z0-9\\-]*$\", word):\n",
    "            src_idx = wordidx[word]\n",
    "            emb[i] = vecs[src_idx]\n",
    "        else:\n",
    "            # If we can't find the word in glove, randomly initialize\n",
    "            emb[i] = normal(scale=0.6, size=(n_fact,))\n",
    "\n",
    "    # This is our \"rare word\" id - we want to randomly initialize\n",
    "    emb[-1] = normal(scale=0.6, size=(n_fact,))\n",
    "    emb/=3\n",
    "    return emb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "emb = create_emb()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "emb.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "emb[1,:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:pytorch]",
   "language": "python",
   "name": "conda-env-pytorch-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
